In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pylab as py
In [2]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [3]:
cdf=pd.read_csv('train (1).csv')
In [4]:
cdf.head()
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [6]:
cdf.sample(10)
Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
265 266 0 2 Reeves, Mr. David male 36.0 0 0 C.A. 17248 10.5000 NaN S
571 572 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female 53.0 2 0 11769 51.4792 C101 S
225 226 0 3 Berglund, Mr. Karl Ivar Sven male 22.0 0 0 PP 4348 9.3500 NaN S
808 809 0 2 Meyer, Mr. August male 39.0 0 0 248723 13.0000 NaN S
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
153 154 0 3 van Billiard, Mr. Austin Blyler male 40.5 0 2 A/5. 851 14.5000 NaN S
56 57 1 2 Rugg, Miss. Emily female 21.0 0 0 C.A. 31026 10.5000 NaN S
68 69 1 3 Andersson, Miss. Erna Alexandra female 17.0 4 2 3101281 7.9250 NaN S
585 586 1 1 Taussig, Miss. Ruth female 18.0 0 2 110413 79.6500 E68 S
486 487 1 1 Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby) female 35.0 1 0 19943 90.0000 C93 S
In [7]:
cdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [9]:
cdf.isnull().sum()
Out[9]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [11]:
fig=py.figure(figsize=(15,15))
sns.heatmap(cdf.isnull(),yticklabels=False,)
Out[11]:
<AxesSubplot:>
In [12]:
cdf.head()
Out[12]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [15]:
sns.set(style='darkgrid')
sns.countplot(x='Survived',data=cdf)
Out[15]:
<AxesSubplot:xlabel='Survived', ylabel='count'>
In [16]:
sns.countplot(x='Survived',data=cdf,hue='Pclass')
Out[16]:
<AxesSubplot:xlabel='Survived', ylabel='count'>
In [20]:
sns.displot(cdf['Age'],height=8,bins=30)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x22d5ab3f9a0>
In [21]:
cdf.head()
Out[21]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [22]:
sns.countplot(x='SibSp',data=cdf)
Out[22]:
<AxesSubplot:xlabel='SibSp', ylabel='count'>
In [28]:
cdf['Fare'].plot(kind='hist',bins=30,)
Out[28]:
<AxesSubplot:ylabel='Frequency'>
In [29]:
import cufflinks as cf
cf.go_offline(connected=False)
In [30]:
cdf['Fare'].iplot(kind='hist')
In [31]:
cdf.head()
Out[31]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [32]:
sns.boxplot(x='Pclass',y='Age',data=cdf)
Out[32]:
<AxesSubplot:xlabel='Pclass', ylabel='Age'>
In [35]:
print(cdf[cdf['Pclass']==1]['Age'].mean())
print('\n')
print(cdf[cdf['Pclass']==2]['Age'].mean())
print('\n')
print(cdf[cdf['Pclass']==3]['Age'].mean())
38.233440860215055


29.87763005780347


25.14061971830986
In [36]:
def impute(new_age):
    Age=new_age[0]
    Pclass=new_age[1]
    if pd.isnull(Age):
        if Pclass==1:
            return int(cdf[cdf['Pclass']==1]['Age'].mean())
        elif Pclass==2:
            return int(cdf[cdf['Pclass']==2]['Age'].mean())
        else:
            return int(cdf[cdf['Pclass']==3]['Age'].mean())
    else:
        return Age
In [37]:
int(cdf[cdf['Pclass']==1]['Age'].mean())
Out[37]:
38
In [40]:
cdf['new_age']=cdf[['Age','Pclass']].apply(impute, axis=1)
In [41]:
cdf
Out[41]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked new_age
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 22.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 38.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 26.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 35.0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 35.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 27.0
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 19.0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 25.0
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 26.0
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 32.0

891 rows × 13 columns

In [42]:
c=cdf['Name'].loc[1]
In [43]:
c
Out[43]:
'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
In [51]:
c.
Out[51]:
'Mrs.'
In [52]:
def tit(le):
    return le.split(' ')[1].split(' ')[0]
In [53]:
cdf['title']=cdf['Name'].apply(tit)
In [57]:
cdf.groupby(by='title').count()['new_age'].plot()
Out[57]:
<AxesSubplot:xlabel='title'>
In [58]:
cdf.head()
Out[58]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked new_age title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 22.0 Mr.
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 38.0 Mrs.
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 26.0 Miss.
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 35.0 Mrs.
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 35.0 Mr.
In [59]:
cdf=cdf.drop(['PassengerId','Name','Age','Ticket','Cabin'],axis=1)
In [60]:
cdf
Out[60]:
Survived Pclass Sex SibSp Parch Fare Embarked new_age title
0 0 3 male 1 0 7.2500 S 22.0 Mr.
1 1 1 female 1 0 71.2833 C 38.0 Mrs.
2 1 3 female 0 0 7.9250 S 26.0 Miss.
3 1 1 female 1 0 53.1000 S 35.0 Mrs.
4 0 3 male 0 0 8.0500 S 35.0 Mr.
... ... ... ... ... ... ... ... ... ...
886 0 2 male 0 0 13.0000 S 27.0 Rev.
887 1 1 female 0 0 30.0000 S 19.0 Miss.
888 0 3 female 1 2 23.4500 S 25.0 Miss.
889 1 1 male 0 0 30.0000 C 26.0 Mr.
890 0 3 male 0 0 7.7500 Q 32.0 Mr.

891 rows × 9 columns

In [61]:
cd=pd.get_dummies(cdf,drop_first=True)
In [62]:
cd.head()
Out[62]:
Survived Pclass SibSp Parch Fare new_age Sex_male Embarked_Q Embarked_S title_Capt. ... title_Pelsmaeker, title_Planke, title_Rev. title_Shawah, title_Steen, title_Velde, title_Walle, title_der title_the title_y
0 0 3 1 0 7.2500 22.0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 1 1 1 0 71.2833 38.0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 3 0 0 7.9250 26.0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 53.1000 35.0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 3 0 0 8.0500 35.0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 39 columns

In [63]:
from sklearn.ensemble import ExtraTreesClassifier
In [83]:
cd.head()
Out[83]:
Survived Pclass SibSp Parch Fare new_age Sex_male Embarked_Q Embarked_S title_Capt. ... title_Pelsmaeker, title_Planke, title_Rev. title_Shawah, title_Steen, title_Velde, title_Walle, title_der title_the title_y
0 0 3 1 0 7.2500 22.0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 1 1 1 0 71.2833 38.0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 3 0 0 7.9250 26.0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 53.1000 35.0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 3 0 0 8.0500 35.0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 39 columns

In [91]:
X=cd.drop('Survived',axis=1)
y=cd['Survived']
In [92]:
from sklearn.model_selection import train_test_split
In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
In [94]:
from sklearn.linear_model import LogisticRegression
In [97]:
logmodel=LogisticRegression(max_iter=500)
In [98]:
logmodel.fit(X_train,y_train)
Out[98]:
LogisticRegression(max_iter=500)
In [99]:
predict=logmodel.predict(X_test)
In [100]:
from sklearn.metrics import classification_report , confusion_matrix
In [101]:
print(classification_report(y_test,predict))
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       169
           1       0.87      0.73      0.79       126

    accuracy                           0.84       295
   macro avg       0.84      0.82      0.83       295
weighted avg       0.84      0.84      0.83       295

In [102]:
print(confusion_matrix(y_test,predict))
[[155  14]
 [ 34  92]]
In [104]:
sns.heatmap(confusion_matrix(y_test,predict),annot=True)
Out[104]:
<AxesSubplot:>
In [ ]: